In [1]:
import pickle as pkl
import sqlite3 as sql
from contextlib import closing
from random import choices

import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.optimize as optimize
from pandas import cut, read_sql
from sklearn import metrics

In [2]:
# Input SQLite database and the directory the selection CSVs are written to.
indir_db = "/groups/icecube/petersen/GraphNetDatabaseRepository/osc_next_database_Peter_and_Morten/merged_database/osc_next_level3_v2.00_genie_muongun_noise_120000_140000_160000_130000_888003.db"
outdir = "/groups/icecube/petersen/GraphNetDatabaseRepository/osc_next_database_Peter_and_Morten/Selections/New_selections_Peter/"

# Load the three truth columns used by the selections in this notebook.
# `closing` is required here: sqlite3's own context manager only scopes a
# transaction (commit/rollback on exit) and leaves the connection open, so a
# bare `with sql.connect(...)` would leak the database handle.
with closing(sql.connect(indir_db)) as con:
    query = """
    SELECT
        event_no, pid, interaction_type
    FROM 
        truth
    """
    truth_data = read_sql(query, con)

In [25]:
# Build one truth sub-frame per event class from boolean masks on `pid`
# (both signs of every code are accepted) and on `interaction_type`
# (value 1 is what this notebook names CC, value 2 what it names NC).
pid_column = truth_data['pid']
interaction_column = truth_data['interaction_type']

is_cc = interaction_column == 1
is_nc = interaction_column == 2
is_electron_neutrino = pid_column.isin((-12, 12))
is_muon_neutrino = pid_column.isin((-14, 14))
is_tau_neutrino = pid_column.isin((-16, 16))

electron_neutrinos_CC = truth_data[is_electron_neutrino & is_cc]
electron_neutrinos_NC = truth_data[is_electron_neutrino & is_nc]
muon_neutrinos_CC = truth_data[is_muon_neutrino & is_cc]
muon_neutrinos_NC = truth_data[is_muon_neutrino & is_nc]
tau_neutrinos_CC = truth_data[is_tau_neutrino & is_cc]
tau_neutrinos_NC = truth_data[is_tau_neutrino & is_nc]

# Muons and noise are selected on pid alone; interaction_type is not checked.
muons = truth_data[pid_column.isin((-13, 13))]
noise = truth_data[pid_column.isin((-1, 1))]

In [26]:
# Report how many events fall into each class, then the overall totals.
labelled_classes = (
    ('e CC: ', electron_neutrinos_CC),
    ('e NC: ', electron_neutrinos_NC),
    ('mu neu CC: ', muon_neutrinos_CC),
    ('mu neu NC: ', muon_neutrinos_NC),
    ('tau CC: ', tau_neutrinos_CC),
    ('tau NC: ', tau_neutrinos_NC),
    ('muons: ', muons),
    ('noise: ', noise),
)
for class_label, class_frame in labelled_classes:
    print(class_label, len(class_frame))

# All neutrino pids together, irrespective of interaction_type.
any_neutrino = truth_data['pid'].isin((-12, 12, -14, 14, -16, 16))
print('total neutrinos', len(truth_data[any_neutrino]))
print('total events', len(truth_data))

e CC:  7457888
e NC:  840825
mu neu CC:  18220448
mu neu NC:  1886504
tau CC:  6510308
tau NC:  2375773
muons:  15990173
noise:  15186239
total neutrinos 37294941
total events 68471353


In [27]:
# Shuffle every class once so that the positional cuts applied to these frames
# afterwards pick events at random. The seed is fixed so the shuffle, and with
# it the resulting selections, can be reproduced on a re-run.
# NOTE(review): reproducibility also assumes the truth query returns rows in
# the same order each time - confirm for this database.
SHUFFLE_SEED = 42

electron_neutrinos_CC = electron_neutrinos_CC.sample(frac=1, random_state=SHUFFLE_SEED)
electron_neutrinos_NC = electron_neutrinos_NC.sample(frac=1, random_state=SHUFFLE_SEED)
muon_neutrinos_CC = muon_neutrinos_CC.sample(frac=1, random_state=SHUFFLE_SEED)
muon_neutrinos_NC = muon_neutrinos_NC.sample(frac=1, random_state=SHUFFLE_SEED)
tau_neutrinos_CC = tau_neutrinos_CC.sample(frac=1, random_state=SHUFFLE_SEED)
tau_neutrinos_NC = tau_neutrinos_NC.sample(frac=1, random_state=SHUFFLE_SEED)
muons = muons.sample(frac=1, random_state=SHUFFLE_SEED)
noise = noise.sample(frac=1, random_state=SHUFFLE_SEED)

In [28]:
def _split_at(frame, boundaries):
    """Cut `frame` row-wise at the given positional boundaries.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to cut; rows are addressed by position, not by index label.
    boundaries : sequence of int
        Increasing row positions at which a new segment starts.

    Returns
    -------
    list of pandas.DataFrame
        ``len(boundaries) + 1`` consecutive segments. Boundaries beyond the
        end of the frame yield empty segments, as with ``np.array_split``.
    """
    # Plain positional slicing replaces np.array_split, which on a DataFrame
    # goes through the deprecated DataFrame.swapaxes.
    edges = [0, *boundaries, len(frame)]
    return [frame.iloc[start:stop] for start, stop in zip(edges[:-1], edges[1:])]


# Cut positions per class: segment 0 is rows [0, first), segment 1 is rows
# [first, second), segment 2 is everything after. The concat cells that follow
# use them as train, validation and test respectively.
SMALL_CLASS_CUTS = (100000, 120000)
LARGE_CLASS_CUTS = (600000, 720000)

electron_neutrinos_CC_split = _split_at(electron_neutrinos_CC, SMALL_CLASS_CUTS)
electron_neutrinos_NC_split = _split_at(electron_neutrinos_NC, SMALL_CLASS_CUTS)
muon_neutrinos_CC_split = _split_at(muon_neutrinos_CC, LARGE_CLASS_CUTS)
muon_neutrinos_NC_split = _split_at(muon_neutrinos_NC, SMALL_CLASS_CUTS)
tau_neutrinos_CC_split = _split_at(tau_neutrinos_CC, SMALL_CLASS_CUTS)
tau_neutrinos_NC_split = _split_at(tau_neutrinos_NC, SMALL_CLASS_CUTS)
muons_split = _split_at(muons, LARGE_CLASS_CUTS)
noise_split = _split_at(noise, LARGE_CLASS_CUTS)

In [29]:
# Assemble the multiclassification frames from segment 0 (train), segment 1
# (validation) and segment 2 (test) of every class.
# The muon-neutrino CC class was cut at larger positions than the other
# neutrino classes, so only its first 100000 (train) and 20000 (validation)
# rows are kept here; its test segment is used in full.
multiclass_train_parts = [
    electron_neutrinos_CC_split[0],
    electron_neutrinos_NC_split[0],
    muon_neutrinos_CC_split[0].head(100000),
    muon_neutrinos_NC_split[0],
    tau_neutrinos_CC_split[0],
    tau_neutrinos_NC_split[0],
    muons_split[0],
    noise_split[0],
]
multiclass_val_parts = [
    electron_neutrinos_CC_split[1],
    electron_neutrinos_NC_split[1],
    muon_neutrinos_CC_split[1].head(20000),
    muon_neutrinos_NC_split[1],
    tau_neutrinos_CC_split[1],
    tau_neutrinos_NC_split[1],
    muons_split[1],
    noise_split[1],
]
multiclass_test_parts = [
    electron_neutrinos_CC_split[2],
    electron_neutrinos_NC_split[2],
    muon_neutrinos_CC_split[2],
    muon_neutrinos_NC_split[2],
    tau_neutrinos_CC_split[2],
    tau_neutrinos_NC_split[2],
    muons_split[2],
    noise_split[2],
]

Multiclassification_train = pd.concat(multiclass_train_parts, ignore_index=True)
Multiclassification_val = pd.concat(multiclass_val_parts, ignore_index=True)
Multiclassification_test = pd.concat(multiclass_test_parts, ignore_index=True)

In [30]:
# Show the per-pid event counts of the train, validation and test frames.
for multiclass_frame in (
    Multiclassification_train,
    Multiclassification_val,
    Multiclassification_test,
):
    print(multiclass_frame['pid'].value_counts())

 13.0    600000
-1.0     600000
 16.0    147684
 12.0    144045
 14.0    143169
-14.0     56831
-12.0     55955
-16.0     52316
Name: pid, dtype: int64
 13.0    120000
-1.0     120000
 16.0     29646
 12.0     28790
 14.0     28606
-14.0     11394
-12.0     11210
-16.0     10354
Name: pid, dtype: int64
 13.0    15270173
-1.0     14466239
 14.0    13257627
 16.0     6278118
-14.0     6009325
 12.0     5575391
-12.0     2483322
-16.0     2367963
Name: pid, dtype: int64


In [31]:
# Mix the rows of each frame so the classes are no longer stored in the
# class-by-class order produced by the concatenation. The seed is fixed so the
# row order of the exported CSV files is the same on every run.
MULTICLASS_ORDER_SEED = 42

Multiclassification_train = Multiclassification_train.sample(frac=1, random_state=MULTICLASS_ORDER_SEED)
Multiclassification_val = Multiclassification_val.sample(frac=1, random_state=MULTICLASS_ORDER_SEED)
Multiclassification_test = Multiclassification_test.sample(frac=1, random_state=MULTICLASS_ORDER_SEED)

In [32]:
# Write the event numbers of each multiclassification frame to its own CSV in
# `outdir`; only the `event_no` column is exported and the index is dropped.
multiclass_exports = (
    (Multiclassification_train, 'Multiclassification_train_event_no.csv'),
    (Multiclassification_val, 'Multiclassification_val_event_no.csv'),
    (Multiclassification_test, 'Multiclassification_test_event_no.csv'),
)
for export_frame, export_name in multiclass_exports:
    export_frame.to_csv(path_or_buf=outdir + export_name, columns=['event_no'], index=False)


In [33]:
# Assemble the track/cascade frames from the electron- and muon-neutrino
# segments. Muon-neutrino CC is limited to 300000 (train) and 60000
# (validation) rows, which equals the combined cut size of the other three
# classes in the same frame.
# NOTE(review): tau neutrinos are part of the test frame only, not of train or
# validation - confirm this is intended.
track_cascade_train_parts = [
    electron_neutrinos_CC_split[0],
    electron_neutrinos_NC_split[0],
    muon_neutrinos_CC_split[0].head(300000),
    muon_neutrinos_NC_split[0],
]
track_cascade_val_parts = [
    electron_neutrinos_CC_split[1],
    electron_neutrinos_NC_split[1],
    muon_neutrinos_CC_split[1].head(60000),
    muon_neutrinos_NC_split[1],
]
track_cascade_test_parts = [
    electron_neutrinos_CC_split[2],
    electron_neutrinos_NC_split[2],
    muon_neutrinos_CC_split[2],
    muon_neutrinos_NC_split[2],
    tau_neutrinos_CC_split[2],
    tau_neutrinos_NC_split[2],
]

Track_cascade_train = pd.concat(track_cascade_train_parts, ignore_index=True)
Track_cascade_val = pd.concat(track_cascade_val_parts, ignore_index=True)
Track_cascade_test = pd.concat(track_cascade_test_parts, ignore_index=True)

In [34]:
# Mix the rows of each frame so the classes are no longer stored in the
# class-by-class order produced by the concatenation. The seed is fixed so the
# row order of the exported CSV files is the same on every run.
TRACK_CASCADE_ORDER_SEED = 42

Track_cascade_train = Track_cascade_train.sample(frac=1, random_state=TRACK_CASCADE_ORDER_SEED)
Track_cascade_val = Track_cascade_val.sample(frac=1, random_state=TRACK_CASCADE_ORDER_SEED)
Track_cascade_test = Track_cascade_test.sample(frac=1, random_state=TRACK_CASCADE_ORDER_SEED)

In [35]:
# Show the composition of the track/cascade frames: first the per-pid counts
# of train, validation and test, then the per-interaction_type counts in the
# same frame order.
track_cascade_frames = (Track_cascade_train, Track_cascade_val, Track_cascade_test)
for counted_column in ('pid', 'interaction_type'):
    for track_cascade_frame in track_cascade_frames:
        print(track_cascade_frame[counted_column].value_counts())

 14.0    279683
 12.0    144045
-14.0    120317
-12.0     55955
Name: pid, dtype: int64
 14.0    55842
 12.0    28790
-14.0    24158
-12.0    11210
Name: pid, dtype: int64
 14.0    13257627
 16.0     6278118
-14.0     6009325
 12.0     5575391
-12.0     2483322
-16.0     2367963
Name: pid, dtype: int64
1.0    400000
2.0    200000
Name: interaction_type, dtype: int64
1.0    80000
2.0    40000
Name: interaction_type, dtype: int64
1.0    31228644
2.0     4743102
Name: interaction_type, dtype: int64


In [36]:
# Write the event numbers of each track/cascade frame to its own CSV in
# `outdir`; only the `event_no` column is exported and the index is dropped.
track_cascade_exports = (
    (Track_cascade_train, 'Track_cascade_train_event_no.csv'),
    (Track_cascade_val, 'Track_cascade_val_event_no.csv'),
    (Track_cascade_test, 'Track_cascade_test_event_no.csv'),
)
for export_frame, export_name in track_cascade_exports:
    export_frame.to_csv(path_or_buf=outdir + export_name, columns=['event_no'], index=False)


In [37]:
# Preview the first ten rows of both training frames as a quick visual check.
for preview_frame in (Multiclassification_train, Track_cascade_train):
    print(preview_frame.head(10))

         event_no   pid  interaction_type
764033   39808084  13.0              -1.0
1072161  15334024  13.0              -1.0
358294   22651494  14.0               2.0
1513539  32833043  -1.0              -1.0
1654301  33382160  -1.0              -1.0
958773   26007766  13.0              -1.0
474611   41091250  16.0               1.0
571377   12440629  16.0               2.0
237334   25411594 -14.0               1.0
446560   46283444  16.0               1.0
        event_no   pid  interaction_type
202196  16557701 -14.0               1.0
274988  11060250 -14.0               1.0
387004  37729686  14.0               1.0
599621   6956905  14.0               2.0
558763  49597352  14.0               2.0
6245    10291955 -12.0               1.0
461951  36045979  14.0               1.0
594561  20625496  14.0               2.0
98505   18569474  12.0               1.0
413390  48232365  14.0               1.0
