In [10]:
import os
import glob
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

def init_model_and_scaler(data_dir, detid_table_path, model_dir, n_sample_files=10):
    track_paths = sorted(glob.glob(os.path.join(data_dir, '*.csv')))
    sample_paths = track_paths[:n_sample_files]

    df_list = [pd.read_csv(p) for p in sample_paths]
    df_sample = pd.concat(df_list, ignore_index=True)
    df_sample['det_raw_id'] = df_sample['det_raw_id'].apply(lambda x: list(map(int, x.split(' ')[:-1])))

    detid_table_df = pd.read_csv(detid_table_path)
    detid_table = np.sort(detid_table_df['det_raw_id'].unique())

    detid_encoder = MultiLabelBinarizer(classes=detid_table)
    detid_encoder.fit(df_sample['det_raw_id'])

    track_scaler = StandardScaler()
    track_scaler.fit(df_sample[['track_pt', 'track_eta', 'track_phi']].values)

    X_sample = track_scaler.transform(df_sample[['track_pt', 'track_eta', 'track_phi']].values)
    Y_sample = detid_encoder.transform(df_sample['det_raw_id'])
    X_train, X_val, Y_train, Y_val = train_test_split(X_sample, Y_sample, test_size=0.1, random_state=42)

    model = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)
    model.fit(X_train, Y_train, eval_set=[(X_val, Y_val)], verbose=True)
    
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model.save_model(os.path.join(model_dir, 'model'))
    joblib.dump(track_scaler, os.path.join(model_dir, 'scaler.pkl'))
    joblib.dump(detid_encoder, os.path.join(model_dir, 'encoder.pkl'))

def load_data(file_paths, track_scaler, detid_encoder):
    """Read track CSV files and convert them to scaled features and encoded labels.

    Args:
        file_paths: Iterable of CSV paths; each file must provide the columns
            ``track_pt``, ``track_eta``, ``track_phi`` and ``det_raw_id``
            (whitespace-separated integer IDs stored as one string per row).
        track_scaler: Fitted object exposing ``transform`` for the feature matrix.
        detid_encoder: Fitted object exposing ``transform`` for the ID lists.

    Returns:
        Tuple ``(X, Y)`` with the scaler output and the encoder output for all
        rows of all files, in file order.

    Raises:
        ValueError: Propagated from ``pd.concat`` when ``file_paths`` is empty.
    """
    frames = [pd.read_csv(file_path) for file_path in file_paths]
    df = pd.concat(frames, ignore_index=True)
    # split() without a separator ignores leading/trailing/repeated whitespace,
    # so every ID is kept regardless of whether the field ends with a space
    # (the previous split(' ')[:-1] dropped the last ID when it did not).
    df['det_raw_id'] = df['det_raw_id'].apply(
        lambda raw: [int(tok) for tok in raw.split()]
    )

    X = track_scaler.transform(df[['track_pt', 'track_eta', 'track_phi']].values)
    Y = detid_encoder.transform(df['det_raw_id'])
    return X, Y

def data_generator(file_paths, batch_size, track_scaler, detid_encoder):
    """Lazily yield ``(X, Y)`` pairs, loading ``batch_size`` files at a time.

    Consecutive slices of ``file_paths`` are passed to ``load_data`` together
    with the given scaler and encoder; the final slice may contain fewer than
    ``batch_size`` files. Nothing is yielded for an empty ``file_paths``.
    """
    total = len(file_paths)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        features, labels = load_data(file_paths[start:stop], track_scaler, detid_encoder)
        yield features, labels

def train_model_and_scaler(data_dir, init_model_path, scaler_path, encoder_path, model_dir, n_file=100, batch_size=10):
    """Continue training a saved XGBoost model over batches of track files.

    The model at ``init_model_path`` and the scaler/encoder pickles are loaded,
    then the first ``n_file`` CSV files (sorted by path) of ``data_dir`` are
    processed ``batch_size`` files at a time. Every batch is split 90/10 into
    train/validation parts and fitted with the model's current booster passed
    as ``xgb_model``.

    Outputs written to ``model_dir`` (created if it does not exist):
    ``XGBoost_FinalModel.json``, ``scaler.pkl`` and ``encoder.pkl``.
    """
    # Restore the artefacts produced by the initialisation step.
    classifier = XGBClassifier()
    classifier.load_model(init_model_path)
    track_scaler = joblib.load(scaler_path)
    detid_encoder = joblib.load(encoder_path)

    csv_pattern = os.path.join(data_dir, '*.csv')
    train_track_paths = sorted(glob.glob(csv_pattern))[:n_file]

    batches = data_generator(train_track_paths, batch_size, track_scaler, detid_encoder)
    for features, labels in batches:
        parts = train_test_split(features, labels, test_size=0.1, random_state=42)
        X_train, X_val, Y_train, Y_val = parts
        classifier.fit(
            X_train,
            Y_train,
            xgb_model=classifier.get_booster(),
            eval_set=[(X_val, Y_val)],
            verbose=True,
        )

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    final_model_path = os.path.join(model_dir, 'XGBoost_FinalModel.json')
    classifier.save_model(final_model_path)
    # The scaler and encoder are stored next to the final model.
    joblib.dump(track_scaler, os.path.join(model_dir, 'scaler.pkl'))
    joblib.dump(detid_encoder, os.path.join(model_dir, 'encoder.pkl'))

def evaluate_model_and_scaler(data_dir, model_path, scaler_path, encoder_path, n_eval_files=10):
    """Score a saved model on the last ``n_eval_files`` CSV files of ``data_dir``.

    Args:
        data_dir: Directory containing the track CSV files.
        model_path: Path of the saved XGBoost model to load.
        scaler_path: Path of the pickled feature scaler.
        encoder_path: Path of the pickled label encoder.
        n_eval_files: Number of trailing files (sorted by path) to evaluate on;
            must be positive.

    Returns:
        The mean of the element-wise comparison between the model predictions
        and the encoded labels, i.e. the fraction of matching entries.

    Raises:
        ValueError: If ``n_eval_files`` is not positive or ``data_dir`` holds no
            CSV files.
    """
    # Guard first: track_paths[-0:] would silently select *every* file and a
    # negative count would skip files from the front instead of the back.
    if n_eval_files <= 0:
        raise ValueError(f"n_eval_files must be positive, got {n_eval_files}")

    model = XGBClassifier()
    model.load_model(model_path)

    track_scaler = joblib.load(scaler_path)
    detid_encoder = joblib.load(encoder_path)

    track_paths = sorted(glob.glob(os.path.join(data_dir, '*.csv')))
    # NOTE(review): training uses the first files of the same sorted listing,
    # so the two sets can overlap when the directory holds few files.
    eval_paths = track_paths[-n_eval_files:]
    if not eval_paths:
        raise ValueError(f"No CSV files found in {data_dir!r}")

    X_eval, Y_eval = load_data(eval_paths, track_scaler, detid_encoder)
    preds = model.predict(X_eval)
    accuracy = (preds == Y_eval).mean()
    return accuracy


In [11]:
if __name__ == '__main__':
    # Pipeline driver. Only the initialisation stage is active; the
    # incremental-training and evaluation stages below are commented out.

    # Directory that is globbed for '*.csv' track files.
    data_dir = '/users/hep/eigen1907/Workspace/Workspace-DL/241215-track_det_raw_id/TrackDetMatches'
    # CSV whose 'det_raw_id' column defines the label classes for the encoder.
    detid_table_path = '/users/hep/eigen1907/Workspace/Workspace-DL/241215-track_det_raw_id/muon_system_det_raw_id.csv'
    # Output directory for the model, scaler and encoder files.
    model_dir = '/users/hep/eigen1907/Workspace/Workspace-DL/241218-XGBoost/model'

    # Stage 1: fit the scaler/encoder and an initial model on the first 3 files.
    init_model_and_scaler(data_dir, detid_table_path, model_dir, n_sample_files=3)

    # Stage 2 (disabled): continue training from the stage-1 artefacts.
    # The paths below must match the file names written by init_model_and_scaler.
    #init_model_path = os.path.join(model_dir, 'model.json')
    #scaler_path = os.path.join(model_dir, 'scaler.pkl')
    #encoder_path = os.path.join(model_dir, 'encoder.pkl')
    #train_model_and_scaler(data_dir, init_model_path, scaler_path, encoder_path, model_dir, n_file=100, batch_size=10)

    # Stage 3 (disabled): evaluate the final model on the last 10 files.
    #final_model_path = os.path.join(model_dir, 'XGBoost_FinalModel.json')
    #eval_accuracy = evaluate_model_and_scaler(data_dir, final_model_path, scaler_path, encoder_path, n_eval_files=10)
    #print("Evaluation accuracy:", eval_accuracy)

[0]	validation_0-logloss:0.11579
[1]	validation_0-logloss:0.10428
[2]	validation_0-logloss:0.09400
[3]	validation_0-logloss:0.08481
[4]	validation_0-logloss:0.07658
[5]	validation_0-logloss:0.06919
[6]	validation_0-logloss:0.06256
[7]	validation_0-logloss:0.05661
[8]	validation_0-logloss:0.05125
[9]	validation_0-logloss:0.04643
[10]	validation_0-logloss:0.04208
[11]	validation_0-logloss:0.03817
[12]	validation_0-logloss:0.03464
[13]	validation_0-logloss:0.03145
[14]	validation_0-logloss:0.02858
[15]	validation_0-logloss:0.02599
[16]	validation_0-logloss:0.02364
[17]	validation_0-logloss:0.02152
[18]	validation_0-logloss:0.01961
[19]	validation_0-logloss:0.01788
[20]	validation_0-logloss:0.01631
[21]	validation_0-logloss:0.01489
[22]	validation_0-logloss:0.01361
[23]	validation_0-logloss:0.01245
[24]	validation_0-logloss:0.01140
[25]	validation_0-logloss:0.01045
[26]	validation_0-logloss:0.00959
[27]	validation_0-logloss:0.00881
[28]	validation_0-logloss:0.00811
[29]	validation_0-loglos

