In [1]:
import torch
import time
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import tqdm.auto as tqdm

In [2]:
print("Using torch", torch.__version__)

Using torch 1.11.0+cu102


In [3]:
gpu_avail = torch.cuda.is_available()
print(f"Is the GPU available? {gpu_avail}")

Is the GPU available? True


In [4]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device", device)

Device cuda


In [42]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):

    def __init__(self,input_dim):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_dim, 10)  # 5*5 from image dimension
        self.fc1_bn=nn.BatchNorm1d(10)
        self.fc2 = nn.Linear(10, 13)
        self.fc2_bn=nn.BatchNorm1d(13)
        self.fc3 = nn.Linear(13, 21)
        self.fc3_bn=nn.BatchNorm1d(21)
        self.fc4 = nn.Linear(21, 21)
        self.fc4_bn = nn.BatchNorm1d(21)
        self.fc5 = nn.Linear(21, 15)
        self.fc5_bn=nn.BatchNorm1d(15)
        self.fc6 = nn.Linear(15, 8)
        self.fc6_bn=nn.BatchNorm1d(8)
        self.fc7 = nn.Linear(8, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc1_bn(x)
        x = F.relu(self.fc2(x))
        x = self.fc2_bn(x)
        x = F.relu(self.fc3(x))
        x = self.fc3_bn(x)
        x = F.relu(self.fc4(x))
        x = self.fc4_bn(x)
        x = F.relu(self.fc5(x))
        x = self.fc5_bn(x)
        x = F.relu(self.fc6(x))
        x = self.fc6_bn(x)
        x = F.sigmoid(self.fc7(x))
        return x

## Making The Data

In [43]:
files = ['ttbar','wmp','wpwm','zwpm','n2n2']

cs_pb = [2.558,36.8,2.9*1e-1,7.33*1e-2,3.99*1e-4]
cs = [i*1e3 for i in cs_pb]
k_f = [1.954,1.356,1.92,2.09,1.0]

cs_corr = {files[i] : cs[i]*k_f[i] for i in range(len(files))}

In [44]:
df = []
for f in range(len(files)):
    con_df = []
    
    for i in range(1,11):
        con_df.append(pd.read_csv('~/Datasets/Comp/' + files[f] + str(i) + '.csv'))
    
    df.append(pd.concat(con_df,ignore_index=True))
    df[-1]['type'] = f
    
    if files[f] == "n2n2":
        df[-1]['tag'] = 1
    else:
        df[-1]['tag'] = 0

dtset = pd.concat(df,ignore_index=True)
dtset = shuffle(dtset)
dtset['met'] = np.fabs(dtset['met'])

In [8]:
del(df)

In [9]:
## Analysis Level Cuts
dtset = dtset[dtset['ptl'] >= 120.0][dtset['ptj'] >= 120.0][dtset['etaj'] <= 2.0][dtset['etaj'] >= -2.0]

  dtset = dtset[dtset['ptl'] >= 120.0][dtset['ptj'] >= 120.0][dtset['etaj'] <= 2.0][dtset['etaj'] >= -2.0]


In [10]:
train_len = int(0.8*len(dtset))
x_train = dtset.T[:-2].T[:train_len]
y_train = dtset['tag'][:train_len]

x_test = dtset.T[:-2].T[train_len:]
y_test = dtset['tag'][train_len:]

print('Shapes : ',x_train.shape,y_train.shape,x_test.shape,y_test.shape)

Shapes :  (2296690, 17) (2296690,) (574173, 17) (574173,)


In [11]:
del(dtset)

In [12]:
# Transferring to CUDA
x_train = torch.from_numpy(x_train.values)
y_train = torch.from_numpy(y_train.values)

In [13]:
x_train = x_train.double()
y_train = y_train.double()

## ML

In [45]:
model = Net(17)
criterion = nn.BCELoss()
model.double()
model.to(device)

Net(
  (fc1): Linear(in_features=17, out_features=10, bias=True)
  (fc1_bn): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=10, out_features=13, bias=True)
  (fc2_bn): BatchNorm1d(13, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=13, out_features=21, bias=True)
  (fc3_bn): BatchNorm1d(21, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc4): Linear(in_features=21, out_features=21, bias=True)
  (fc4_bn): BatchNorm1d(21, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc5): Linear(in_features=21, out_features=15, bias=True)
  (fc5_bn): BatchNorm1d(15, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc6): Linear(in_features=15, out_features=8, bias=True)
  (fc6_bn): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc7): Linear(in_features=8, out_features=1, bias=True)
)

In [46]:
learning_rate = 1e-3

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [47]:
batch_size = 256
# n_iters = 3000000
# num_epochs = n_iters / (len(x_train) / batch_size)
# num_epochs = int(num_epochs)
num_epochs = 10

In [48]:
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train

    def __getitem__(self, idx):
        return (self.x_train[idx], self.y_train[idx])

    def __len__(self):
        return len(self.x_train)

In [49]:
ml_ds = SimpleDataset(x_train, y_train)
loader = torch.utils.data.DataLoader(ml_ds, batch_size=batch_size, shuffle=True)

In [50]:
class AverageMeter:
    def __init__(self):
        self.avg = 0
        self.count = 0

    def update(self, avg, count):
        sum = avg * count
        total_sum = self.avg * self.count
        total_sum += sum
        self.count += count
        self.avg = total_sum / self.count

    def reset(self):
        self.avg = 0
        self.count = 0 

In [51]:
loss_avg_meter = AverageMeter()
acc_avg_meter = AverageMeter()

In [52]:
# iter = 0
for epoch in range(num_epochs):
    loss_avg_meter.reset()
    acc_avg_meter.reset()
    tqdm_iter = tqdm.tqdm(loader, total=len(loader))
    tqdm_iter.set_description(f"Epoch {epoch}")
    for batch in tqdm_iter:
        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()
        

        # Forward pass to get output/logits
        x, y = batch
        x = x.to(device)
        y = y.to(device)
        
        outputs = model(x)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, y.unsqueeze(-1))

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        loss_avg_meter.update(loss.item(), x.size(0))

        # Accuracy calculation
        acc = ((outputs >= 0.5).squeeze() == y).float().mean()
        acc_avg_meter.update(acc.item(), x.size(0))

        tqdm_iter.set_postfix(loss=loss_avg_meter.avg, acc=acc_avg_meter.avg)


        # iter += 1

        # if iter % 500 == 0:
        #     # Calculate Accuracy         
        #     correct = 0
        #     total = 0
        #     # Iterate through test dataset
        #     for j in range(len(x_train)):
        #         # Forward pass only to get logits/output
        #         outputs = model(x_train[i].reshape((-1,1)))

        #         # Get predictions from the maximum value
        #         _, predicted = torch.max(outputs.data, 1)

        #         # Total number of labels
        #         total += y_train[i].size(0)

        #         # Total correct predictions
        #         correct += (predicted == y_train[i]).sum()

        #     accuracy = 100 * correct / total

            # Print Loss
            # print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))



  0%|          | 0/8972 [00:00<?, ?it/s]



  0%|          | 0/8972 [00:00<?, ?it/s]

  0%|          | 0/8972 [00:00<?, ?it/s]

  0%|          | 0/8972 [00:00<?, ?it/s]

  0%|          | 0/8972 [00:00<?, ?it/s]

  0%|          | 0/8972 [00:00<?, ?it/s]

  0%|          | 0/8972 [00:00<?, ?it/s]

  0%|          | 0/8972 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [54]:
x_tot = torch.from_numpy(dtset.T[:-2].T.values)
res = model(x)



In [58]:
pred_set = dtset.copy()
pred_set['pred'] = res

ValueError: Length of values (256) does not match length of index (3300390)