In [None]:
import glob
import os
from itertools import chain

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from xgboost import XGBClassifier


def load_data(file_paths, track_scaler, detid_encoder):
    """Read track CSV files and return scaled features with encoded labels.

    Parameters
    ----------
    file_paths : iterable of str
        CSV files to read. Each must provide the columns ``track_pt``,
        ``track_eta``, ``track_phi`` and ``det_raw_id``; the latter holds
        whitespace-separated integer ids as a single string per row.
    track_scaler : object with a ``transform`` method
        Applied to the ``(n_rows, 3)`` array of pt/eta/phi values.
    detid_encoder : object with a ``transform`` method
        Applied to the column of per-row integer id lists.

    Returns
    -------
    X, Y
        The results of ``track_scaler.transform`` and
        ``detid_encoder.transform`` respectively.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``file_paths`` is empty.
    """
    df = pd.concat(
        [pd.read_csv(file_path) for file_path in file_paths],
        ignore_index=True,
    )
    # split() with no argument ignores leading/trailing/repeated whitespace,
    # so the last id is kept whether or not the field ends with a space
    # (the previous split(' ')[:-1] always discarded the final token).
    df['det_raw_id'] = df['det_raw_id'].apply(
        lambda ids: [int(token) for token in ids.split()]
    )

    X = track_scaler.transform(df[['track_pt', 'track_eta', 'track_phi']].values)
    Y = detid_encoder.transform(df['det_raw_id'])
    return X, Y

In [2]:
# Directory that is globbed for '*.csv' track files further down.
data_dir = '/eos/user/j/joshin/TrackDetMatchmaker/TrackDetMatches'
# CSV read into detid_table_df; its 'det_raw_id' column supplies the encoder classes.
detid_table_path = '/afs/cern.ch/user/j/joshin/public/TrackDetMatchmaker/CMSSW_14_2_0/src/TrackDetMatchmaker/Detector/test/det_raw_id.csv'

In [32]:
# Build a sample from the first n_files track CSVs (sorted by path) and fit
# the label encoder and feature scaler on it.
track_paths = sorted(glob.glob(os.path.join(data_dir, '*.csv')))
n_files = 1000  # upper bound on how many files contribute to the sample
sample_paths = track_paths[:n_files]

df_list = [pd.read_csv(p) for p in sample_paths]
df_sample = pd.concat(df_list, ignore_index=True)
# split() with no argument tolerates a missing trailing space and repeated
# spaces; the previous split(' ')[:-1] always threw away the last token.
df_sample['det_raw_id'] = df_sample['det_raw_id'].apply(
    lambda x: [int(token) for token in x.split()]
)

# Sorted, de-duplicated ids from the detector table are passed as the
# encoder's classes.
detid_table_df = pd.read_csv(detid_table_path)
detid_table = np.sort(detid_table_df['det_raw_id'].unique())

detid_encoder = MultiLabelBinarizer(classes=detid_table)
detid_encoder.fit(df_sample['det_raw_id'])

track_scaler = StandardScaler()
track_scaler.fit(df_sample[['track_pt', 'track_eta', 'track_phi']].values)

?


In [35]:
# Distinct detector ids seen anywhere in the sample. np.unique returns its
# result sorted, so a separate np.sort pass is unnecessary.
det_raw_ids = np.unique(list(chain.from_iterable(df_sample['det_raw_id'])))

In [39]:
# Membership of each table id in the sampled ids, computed once and reused.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
detid_in_sample = np.isin(detid_table, det_raw_ids)

print(detid_table, len(detid_table))

print(det_raw_ids, len(det_raw_ids))

print(detid_in_sample, len(detid_in_sample))

[574914560 574922752 574923776 ... 696274752 696275264 696291138] 13168
[574914560 574947328 574980096 ... 687907362 687907616 687907618] 3761
[ True False False ... False False False]
13168


In [43]:
# Flag every table row by its OWN det_raw_id. The earlier form computed the
# mask over detid_table (sorted and de-duplicated) and assigned it by position,
# which mislabels rows whenever the CSV is not already in sorted order and
# fails outright if the CSV contains duplicate ids.
detid_table_df['det_raw_id_match'] = detid_table_df['det_raw_id'].isin(det_raw_ids)
detid_table_df.to_csv('/afs/cern.ch/user/j/joshin/public/TrackDetMatchmaker/CMSSW_14_2_0/src/TrackDetMatchmaker/Detector/test/det_raw_id_match.csv', index=False)

In [45]:
# Keep only the table rows whose 'det_raw_id_match' flag is set, then write
# that subset next to the source table.
matched_det_table_df = detid_table_df.loc[lambda table: table['det_raw_id_match']]

matched_det_table_df.to_csv(
    '/afs/cern.ch/user/j/joshin/public/TrackDetMatchmaker/CMSSW_14_2_0/src/TrackDetMatchmaker/Detector/test/matched_det_raw_id.csv',
    index=False,
)