In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
import matplotlib.pyplot as plt


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load CSV file including 100k events
print("Loading dataset...")
df = pd.read_csv('/home/edaerdogan/Desktop/dimuonai/dimuon.csv')


Loading dataset...


In [3]:
#filtering and labeling 
signal_df = df[df['Q1'] * df['Q2'] < 0].copy()
background_df = df[df['Q1'] * df['Q2'] >= 0].copy() 

signal_df['target'] = 0
background_df['target'] = 1

filtered_df = pd.concat([signal_df, background_df], ignore_index=True)
features = filtered_df[['pt1', 'pt2', 'eta1', 'eta2', 'phi1', 'phi2', 'Q1', 'Q2']] #Since M is given as a parameter for both the muons, it is not taken as a parameter
target = filtered_df['target']



In [4]:
train_data, test_data, train_target, test_target = train_test_split(features, target, test_size=0.2, random_state=42)


In [5]:
signal_count = len(df[df['Q1'] * df['Q2'] < 0])
background_count = len(df[df['Q1'] * df['Q2'] >= 0])

total_signal = signal_count
total_background = background_count
total_events = total_signal + total_background

# Calculate percentages
signal_percentage = total_signal / total_events * 100
background_percentage = total_background / total_events * 100

print(f'Total Signal Events: {total_signal}')
print(f'Total Background Events: {total_background}')
print(f'Signal Percentage: {signal_percentage}%')
print(f'Background Percentage: {background_percentage}%')

Total Signal Events: 62214
Total Background Events: 37786
Signal Percentage: 62.214000000000006%
Background Percentage: 37.785999999999994%


In [6]:
features = filtered_df[['pt1', 'pt2', 'eta1', 'eta2', 'phi1', 'phi2', 'Q1', 'Q2']]
target = filtered_df['target']

X = features.to_numpy(dtype='float')
Y = target.to_numpy(dtype='float')

In [7]:
x = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(Y, dtype=torch.float32).reshape(-1, 1)
print(x)
print(y)

tensor([[ 6.2360,  2.3905, -0.5848,  ..., -2.2764, -1.0000,  1.0000],
        [ 8.9484,  6.7821, -1.3530,  ..., -0.3814, -1.0000,  1.0000],
        [ 4.0910,  4.8186,  1.2463,  ...,  2.2493, -1.0000,  1.0000],
        ...,
        [11.4290,  1.1869,  1.8963,  ..., -1.6911, -1.0000, -1.0000],
        [ 9.4401,  2.8993,  1.3652,  ..., -0.5243,  1.0000,  1.0000],
        [ 3.3954, 12.6496,  2.3445,  ..., -1.8854,  1.0000,  1.0000]])
tensor([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])


In [8]:
# define the model
# define the model
model = nn.Sequential(
    nn.Linear(8, 100),
    nn.ReLU(),
    nn.Linear(100, 50),
    nn.ReLU(),
    nn.Linear(50, 20),
    nn.ReLU(),
    nn.Linear(20, 1),
    nn.Sigmoid()
)
print(model)
# train the model
loss_fn   = nn.BCELoss()  # binary cross entropy
optimizer = optim.Adam(model.parameters(), lr=0.001) #lr değiştirrilebilir mi 
 
n_epochs = 20 #e:15 b:32 accuracy: 0.377, e:15 b:64 a:0.377, e:15 b:128 a:0.377, e:30, b:128 a:0.97; e:20 b:128 a:0.99(best result) (20-80 included)
batch_size = 128
 
for epoch in range(n_epochs):
    for i in range(0, len(X), batch_size):
        Xbatch = x[i:i+batch_size]
        y_pred = model(Xbatch)
        ybatch = y[i:i+batch_size]
        loss = loss_fn(y_pred, ybatch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Finished epoch {epoch}, latest loss {loss}')
 
# compute accuracy (no_grad is optional)
with torch.no_grad():
    y_pred = model(x)
accuracy = (y_pred.round() == y).float().mean()
print(f"Accuracy {accuracy}")


Sequential(
  (0): Linear(in_features=8, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=50, bias=True)
  (3): ReLU()
  (4): Linear(in_features=50, out_features=20, bias=True)
  (5): ReLU()
  (6): Linear(in_features=20, out_features=1, bias=True)
  (7): Sigmoid()
)


Finished epoch 0, latest loss 0.0002693688729777932
Finished epoch 1, latest loss 0.0006065780180506408
Finished epoch 2, latest loss 2.141408003808465e-05
Finished epoch 3, latest loss 1.4741329323442187e-05
Finished epoch 4, latest loss 1.323141623288393e-05
Finished epoch 5, latest loss 0.0014532123459503055
Finished epoch 6, latest loss 3.9804373955121264e-05
Finished epoch 7, latest loss 0.0015726550482213497
Finished epoch 8, latest loss 0.0008910330361686647
Finished epoch 9, latest loss 6.592884165002033e-05
Finished epoch 10, latest loss 0.00011740001355065033
Finished epoch 11, latest loss 0.00024978184956125915
Finished epoch 12, latest loss 9.714773477753624e-05
Finished epoch 13, latest loss 5.18995730089955e-05
Finished epoch 14, latest loss 3.1773852242622524e-05
Finished epoch 15, latest loss 5.7518877838447224e-06
Finished epoch 16, latest loss 1.0058285937475375e-07
Finished epoch 17, latest loss 3.645959077402949e-05
Finished epoch 18, latest loss 2.059011603705585e-