In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.utils.data import DataLoader

import test
import train
import utils
from models import classifier, vmae
from models import classifier as classifier_module

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz
!gzip -d HIGGS.csv.gz

In [None]:
# Load the extracted CSV into a DataFrame.
data = pd.read_csv(
    './HIGGS.csv',
    header=None,  # no header row in the file: columns get integer labels
)

In [None]:
# Column 0 is the class label (compared against 1 and 0 further down);
# every remaining column is an input feature.
y = data.iloc[:, 0]
X = data.iloc[:, 1:]

In [None]:
# Convert features and labels to float32 NumPy arrays for the array
# operations in the following cells.
X, y = (np.array(columns, dtype=np.float32) for columns in (X, y))

In [None]:
# Sanity check: (rows, feature columns) of the raw feature matrix.
print(np.shape(X))

In [None]:
# Bring the feature matrix into the 24-column layout (6 rows x 4 values per
# event) that the plotting and reshape cells below rely on:
#   1. drop the trailing 7 columns
#      (NOTE(review): presumably the derived high-level HIGGS features - confirm),
#   2. insert all-zero padding columns at positions 3, 5 and 7, one after the
#      other, so the lepton and missing-energy groups are 4 values wide each.
#
# The column count is checked first: without the check, executing this cell a
# second time would silently strip another 7 columns and pad again.
N_INPUT_COLUMNS = 28
N_TRAILING_DROPPED = 7
ZERO_PAD_POSITIONS = (3, 5, 7)
N_OUTPUT_COLUMNS = N_INPUT_COLUMNS - N_TRAILING_DROPPED + len(ZERO_PAD_POSITIONS)

if X.shape[1] == N_INPUT_COLUMNS:
    X = X[:, :X.shape[1] - N_TRAILING_DROPPED]
    # Positions refer to the array as it grows, so the order must be ascending.
    for pad_position in ZERO_PAD_POSITIONS:
        X = np.insert(X, pad_position, 0, axis=-1)
elif X.shape[1] != N_OUTPUT_COLUMNS:
    # Neither the raw nor the already-padded layout: refuse to guess.
    raise ValueError(
        f"expected {N_INPUT_COLUMNS} raw or {N_OUTPUT_COLUMNS} padded feature "
        f"columns, got {X.shape[1]}"
    )

In [None]:
# Display name of each of the 24 feature columns, in column order
# (four entries per row of the 6 x 4 subplot grid).
names = [
    "lepton pT", "lepton eta", "lepton phi", "Zero Padding",
    "missing energy magnitude", "Zero Padding", "missing energy phi", "Zero Padding",
    "jet 1 pt", "jet 1 eta", "jet 1 phi", "jet 1 b-tag",
    "jet 2 pt", "jet 2 eta", "jet 2 phi", "jet 2 b-tag",
    "jet 3 pt", "jet 3 eta", "jet 3 phi", "jet 3 b-tag",
    "jet 4 pt", "jet 4 eta", "jet 4 phi", "jet 4 b-tag",
]

# Per-class subsets: label 1 is drawn as "Higgs", label 0 as "Background".
X_higgs, y_higgs = X[y == 1], y[y == 1]
X_nohiggs, y_nohiggs = X[y == 0], y[y == 0]

fig, axes = plt.subplots(6, 4, figsize=(15, 20))
fig.tight_layout(pad=4)

# One panel per feature: overlaid, individually normalised step histograms.
hist_style = dict(histtype='step', stacked=False, density=1, bins=30)
for column, (ax, feature_name) in enumerate(zip(axes.flat, names)):
    ax.hist(X_higgs[:, column], label='Higgs', **hist_style)
    ax.hist(X_nohiggs[:, column], label='Background', **hist_style)
    ax.set_title(feature_name)
    ax.set_xlabel(feature_name + ' scaled')
    ax.set_ylabel("Proportion of data (density=1)")
    ax.legend()
plt.show()

In [None]:
# Class balance: event count of the Higgs subset, then of the background subset.
print(len(X_higgs), len(X_nohiggs), sep='\n')

In [None]:
# One target per event: 1.0 for every Higgs event, 0.0 for every background event.
labels_hh = np.ones(X_higgs.shape[0])
labels_tt = np.zeros(X_nohiggs.shape[0])

In [None]:
# 80 / 10 / 10 split of the Higgs events: hold out 20 % first, then cut the
# hold-out in half to obtain the validation and test parts. The fixed seed
# keeps the split reproducible.
X_hh_train, X_val1, labels_hh_train, labels_val1 = train_test_split(
    X_higgs,
    labels_hh,
    test_size=0.2,
    random_state=42,
)
X_hh_val, X_hh_test, labels_hh_val, labels_hh_test = train_test_split(
    X_val1,
    labels_val1,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler is fitted on the Higgs training split
# only; the fitted statistics are then applied unchanged to every other
# subset, including all background events.
# NOTE(review): this cell overwrites its own inputs, so executing it a second
# time without re-running the cells above re-fits on already-scaled data.
x_scaler = StandardScaler().fit(X_hh_train)
X_hh_train, X_hh_val, X_hh_test, X_nohiggs = (
    x_scaler.transform(subset)
    for subset in (X_hh_train, X_hh_val, X_hh_test, X_nohiggs)
)

In [None]:
# Same 80 / 10 / 10 scheme for the (already scaled) background events.
# `X_val1` / `labels_val1` are reused as scratch names for the hold-out part.
X_tt_train, X_val1, labels_tt_train, labels_val1 = train_test_split(
    X_nohiggs,
    labels_tt,
    test_size=0.2,
    random_state=42,
)
X_tt_val, X_tt_test, labels_tt_val, labels_tt_test = train_test_split(
    X_val1,
    labels_val1,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Sizes of the two training splits (Higgs, background).
print(X_hh_train.shape, X_tt_train.shape)

In [None]:
# View every flat 24-feature event as a 6 x 4 grid: one row per group of four
# consecutive columns (lepton, missing energy, jets 1-4 in the `names` order).
(X_hh_train, X_hh_val, X_hh_test,
 X_tt_train, X_tt_val, X_tt_test) = (
    split.reshape(split.shape[0], 6, 4)
    for split in (X_hh_train, X_hh_val, X_hh_test,
                  X_tt_train, X_tt_val, X_tt_test)
)

# Largest absolute value of row 0 / column 2 (lepton phi) in the Higgs
# training split. Only one value is needed, assuming the phi distributions
# are all uniform.
phi_limit = np.max(np.abs(X_hh_train[:, 0, 2]))
# Smallest column-0 value of each of the six rows in the Higgs training split.
lower_pt_limit = [np.min(X_hh_train[:, row, 0]) for row in range(6)]

In [None]:
# Stack Higgs events first and background events second along the event
# axis, for the features and the labels of each split alike.
X_train = np.concatenate((X_hh_train, X_tt_train), axis=0)
labels_train = np.concatenate((labels_hh_train, labels_tt_train), axis=0)

X_val = np.concatenate((X_hh_val, X_tt_val), axis=0)
labels_val = np.concatenate((labels_hh_val, labels_tt_val), axis=0)

X_test = np.concatenate((X_hh_test, X_tt_test), axis=0)
labels_test = np.concatenate((labels_hh_test, labels_tt_test), axis=0)

In [None]:
# Read the run configuration; the cells below look up model sizes, loss
# weights, batch sizes and the optimiser schedule in this dictionary.
with open('./configs/uci_higgs.json', 'r') as config_file:
    config = json.load(config_file)

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Build the autoencoder model from the architecture settings in `config`.
# NOTE(review): the constructor receives config['device'], while the networks
# are later moved to the `device` detected at runtime - confirm the two agree.
tae = vmae.TransformerAutoencoder(
    config['d_model'],
    config['num_heads'],
    config['num_layers'],
    config['d_ff'],
    config['max_seq_len'],
    config['dropout'],
    config['device'],
)
# Build classifier.
# The class is looked up through the `classifier_module` alias from the import
# cell: this assignment rebinds the name `classifier` to the model instance,
# so `classifier.BinaryClassifier` would fail if this cell were run again.
classifier = classifier_module.BinaryClassifier(
    config['class_input_features'],
    config['class_ff_dim'],
)

In [None]:
# Transfer both networks to the device chosen earlier. The second call is the
# last expression of the cell, so its return value is what the notebook shows.
tae.to(device)
classifier.to(device)

In [None]:
# Loss for the autoencoder: the project's custom loss, parameterised by the
# phi bound measured on the training data and three weights from `config`.
criterion = utils.custom_loss(
    phi_limit,
    config['alpha'],
    config['beta'],
    config['gamma'],
)
# Loss for the binary classifier: binary cross-entropy.
criterion_2 = nn.BCELoss()

In [None]:
# Wrap each split in the project's (data, label) dataset ...
train_data = utils.DataLabelDataset(X_train, labels_train)
val_data = utils.DataLabelDataset(X_val, labels_val)
test_data = utils.DataLabelDataset(X_test, labels_test)

# ... and batch it. Training and validation batches are shuffled; the test
# loader keeps the original event order.
train_loader = DataLoader(
    train_data,
    batch_size=config['batch_size'],
    shuffle=True,
)
val_loader = DataLoader(
    val_data,
    batch_size=config['test_batch_size'],
    shuffle=True,
)
test_loader = DataLoader(
    test_data,
    batch_size=config['test_batch_size'],
    shuffle=False,
)

In [None]:
# Schedule settings shared by both optimisers, all taken from `config`.
# The previous version of this cell used the bare names `init_lr`, `lr_decay`,
# `min_lr` and `resume_epoch`, none of which is assigned in this notebook.
# NOTE(review): 'lr_decay', 'min_lr' and 'resume_epoch' were already read from
# `config` by the second optimiser; 'init_lr' is an assumed key name - confirm
# it against configs/uci_higgs.json.
schedule_kwargs = dict(
    lr=config['init_lr'],
    momentum=config['min_momentum'],
    max_momentum=config['max_momentum'],
    epochs_to_saturate=config['epochs_to_saturate'],
    batches_per_epoch=len(train_loader),
    lr_decay=config['lr_decay'],
    min_lr=config['min_lr'],
    resume_epoch=config['resume_epoch'],
)

# Optimiser over the autoencoder parameters only, with weight decay disabled.
optimizer = utils.SGDWithSaturatingMomentumAndDecay(
    tae.parameters(),
    weight_decay=0,
    **schedule_kwargs,
)
# Optimiser over autoencoder and classifier parameters together, using the
# configured weight decay.
optimizer_2 = utils.SGDWithSaturatingMomentumAndDecay(
    list(tae.parameters()) + list(classifier.parameters()),
    weight_decay=config['weight_decay'],
    **schedule_kwargs,
)

In [None]:
# Run training with both optimiser / criterion pairs. Only the epochs that
# remain after `resume_epoch` are requested; the two returned values are
# stored as the minimum validation losses.
val_loss_min, val_loss_min_2 = train.train(
    train_loader,
    val_loader,
    tae,
    classifier,
    optimizer,
    optimizer_2,
    criterion,
    criterion_2,
    mask=config['mask'],
    num_epochs=config['num_epochs'] - config['resume_epoch'],
    save_path='./saved_models/uci_higgs',
)

In [None]:
# Evaluate on the held-out test split. Batch size and mask setting are read
# from `config` (the same keys used for the loaders and for training); the
# bare names `test_batch_size` and `mask` are never assigned in this notebook.
# The fitted scaler is handed over as well.
test.test(
    test_loader,
    config['test_batch_size'],
    X_test,
    labels_test,
    tae,
    classifier,
    criterion,
    config['mask'],
    x_scaler,
)