In [1]:
from ROOT import RDataFrame
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from skimage.io import imread, imshow
from tqdm import tqdm

from fftprep.fftprep import make_img, fft_filter, fft_filter_pairwise, reconstruct_img, complete_data_pw, load_from_sparse, fftpredictor, params_pairwise, assemble_sparse_data

from sklearn.neural_network import MLPRegressor

from scipy.signal import detrend

import pickle

from itertools import combinations

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

from scipy.stats import crystalball
from scipy.optimize import curve_fit

import re

In [11]:
# Binning configuration, one entry per variable:
#   "range" - (low, high) window of the variable
#   "reso"  - factor that turns an offset from range[0] into a grid index
#             (used as int((value - range[0]) * reso) in fftpredictor.predict)
params = {
    "X": {"range": (500, 3500), "reso": 0.1},
    "Y": {"range": (500, 2800), "reso": 0.1},
    "theta": {"range": (0, 0.6), "reso": 100},
    "phi": {"range": (0, 2 * np.pi), "reso": 10},
}

# Input files; they must be listed in order of momentum.
files = [
    "FullSim_1GeV.root",
    "FullSim_5GeV.root",
    "FullSim_10GeV.root",
]

# Build the sparse data from the files above so it can be stored.
assemble_sparse_data(files, params)

In [4]:
def load_multi_from_sparse(bins, n_sigma=1, background_pw=None):
    """Load the full sparse data set for every entry of ``bins``.

    Each entry ``p`` is passed to ``load_from_sparse`` as ``int(p / 1000)``
    together with ``return_full=True`` and the given ``n_sigma`` and
    ``background_pw``.

    Returns:
        list: one ``load_from_sparse`` result per entry, in the order of ``bins``.
    """
    loaded = []
    for momentum in bins:
        # The loader is addressed by the momentum divided by 1000, truncated to int.
        bin_label = int(momentum / 1000)
        loaded.append(
            load_from_sparse(
                return_full=True,
                n_sigma=n_sigma,
                background_pw=background_pw,
                p=bin_label,
            )
        )
    return loaded
    

In [27]:
# Binning configuration handed to the predictor: per variable a (low, high)
# "range" and a "reso" factor converting an offset from range[0] to a grid index.
params = {
    "X": {"range": (500, 3500), "reso": 0.1},
    "Y": {"range": (500, 2800), "reso": 0.1},
    "theta": {"range": (0, 0.6), "reso": 100},
    "phi": {"range": (0, 2 * np.pi), "reso": 10},
}

# Build the predictor from the sparse data of the three momentum bins.
model = fftpredictor(
    load_multi_from_sparse([1000, 5000, 10000]),
    params=params,
)

In [26]:
class fftpredictor():
    """Predictor combining pairwise lookup tables with a linear model.

    For every momentum bin the model holds a mapping of pairwise tables
    (``data``). ``predict`` reads one value per table at the grid cell selected
    by a coordinate vector and feeds those values, together with the raw
    coordinates, into a linear model (``coef_`` / ``intercept_``).

    Note: defining this class rebinds the name ``fftpredictor`` that the
    notebook also imports from ``fftprep.fftprep``.
    """

    def __init__(self, data=None, params=None, n_sigma=1, bins=None):
        """Set up bins, lookup tables, binning parameters and default weights.

        Args:
            data: sequence with one mapping per entry of ``bins``. Each mapping
                goes from a key of the form ``"<var1>_<sep>_<var2>"`` (the
                first and third ``"_"``-separated fields must be keys of
                ``params``) to a table indexable by an ``(i, j)`` tuple.
                When ``None`` the tables are loaded with
                ``load_multi_from_sparse(self.bins, n_sigma=n_sigma)``.
            params: mapping ``variable -> {"range": (low, high), "reso": factor}``.
                When ``None`` the notebook's standard binning is used.
            n_sigma: forwarded to ``load_multi_from_sparse``; only used when
                ``data`` is ``None``.
            bins: momentum bins, in ascending order (required by the
                interpolation in ``predict``). Defaults to
                ``[1000, 5000, 10000]``.
        """
        # Always set self.bins; previously it was only set when bins was None.
        # Stored as a list because predict() relies on list.index().
        self.bins = [1000, 5000, 10000] if bins is None else list(bins)
        if data is None:
            self.data = load_multi_from_sparse(self.bins, n_sigma=n_sigma)
        else:
            self.data = data
        self.params = self._default_params() if params is None else params
        # Default weights: a single coefficient set (and scalar intercept)
        # shared by all bins. fit_linear() replaces both with per-bin lists.
        self.coef_ = np.array([-6.01995295e-02, 4.65687551e-01, 7.69238905e-01, 7.93596378e-01,
                               8.39163281e-01, -2.57176367e-01, 4.76729827e-05, 9.34380346e-05,
                               1.21868165e-01, 7.07807293e-02])
        self.intercept_ = -984.5335006117008

    @staticmethod
    def _default_params():
        """Return a fresh copy of the standard binning configuration."""
        return {
            "X": {"range": (500, 3500), "reso": 0.1},
            "Y": {"range": (500, 2800), "reso": 0.1},
            "theta": {"range": (0, 0.6), "reso": 100},
            "phi": {"range": (0, 2 * np.pi), "reso": 10},
        }

    def _grid_indices(self, vec, keys):
        """Turn each coordinate pair of ``vec`` into the grid index for its table key."""
        pairs = list(combinations(vec, 2))
        indices = []
        for i, key in enumerate(keys):
            fields = key.split("_")
            var1, var2 = fields[0], fields[2]
            indices.append((
                int((pairs[i][0] - self.params[var1]["range"][0]) * self.params[var1]["reso"]),
                int((pairs[i][1] - self.params[var2]["range"][0]) * self.params[var2]["reso"]),
            ))
        return indices

    def _linear_response(self, inputs, bin_ind):
        """Evaluate the linear model of bin ``bin_ind`` on the feature vector ``inputs``."""
        if np.ndim(self.intercept_) == 0:
            # Scalar intercept: one coefficient set shared by every bin (the defaults).
            coef, intercept = self.coef_, self.intercept_
        else:
            # Per-bin lists, as produced by fit_linear().
            coef, intercept = self.coef_[bin_ind], self.intercept_[bin_ind]
        return np.sum(inputs * coef) + intercept

    def predict(self, vec, p, mode="reg"):
        """Predict for one coordinate vector at momentum ``p``.

        Args:
            vec: coordinate values. The pairs ``combinations(vec, 2)`` are
                matched by position to the table keys of the bin, so the order
                of ``vec`` must agree with the key order of ``data``.
            p: momentum. If it equals one of ``self.bins`` that bin is used
                directly; otherwise the two neighbouring bins are linearly
                interpolated.
            mode: ``"all"`` returns the list of pairwise table values; any
                other value returns the linear-model output.

        Raises:
            ValueError: if ``p`` lies outside the range covered by ``self.bins``.
        """
        if p in self.bins:
            bin_ind = self.bins.index(p)
            tables = self.data[bin_ind]
            keys = list(tables.keys())
            predictions_pw = [
                tables[key][idx] for key, idx in zip(keys, self._grid_indices(vec, keys))
            ]
            if mode == "all":
                return predictions_pw
            inputs = np.concatenate((predictions_pw, vec))
            return self._linear_response(inputs, bin_ind)

        # np.digitize gives the index of the first bin above p (bins ascending).
        upper_ind = int(np.digitize(p, self.bins))
        if upper_ind == 0 or upper_ind >= len(self.bins):
            # Index 0 would wrap around to the last bin, len(bins) would overrun.
            raise ValueError(
                f"p={p} is outside the momentum range covered by bins {self.bins}"
            )
        lower_ind = upper_ind - 1
        lower_p, upper_p = self.bins[lower_ind], self.bins[upper_ind]
        # Weight of the upper bin: 0 at lower_p, 1 at upper_p.
        scale = (p - lower_p) / (upper_p - lower_p)

        data_lower = self.data[lower_ind]
        data_upper = self.data[upper_ind]
        keys = list(data_lower.keys())
        predictions_pw = [
            data_lower[key][idx] * (1 - scale) + data_upper[key][idx] * scale
            for key, idx in zip(keys, self._grid_indices(vec, keys))
        ]
        if mode == "all":
            return predictions_pw
        inputs = np.concatenate((predictions_pw, vec))
        # The two weights must sum to one: (1 - scale) for the lower bin,
        # scale for the upper bin.
        return self._linear_response(inputs, lower_ind) * (1 - scale) \
            + self._linear_response(inputs, upper_ind) * scale

    def fit_linear(self, inputs, outputs):
        """Fit one linear regression per momentum bin.

        Args:
            inputs: sequence with one 2-D array per bin. Columns 1, 2, 4 and 5
                form the coordinate vector passed to ``predict``; columns 0
                and 3 are dropped from the regression features (presumably the
                layout ``p, x, y, z, theta, phi`` produced by
                ``get_training_data`` - confirm against the caller).
            outputs: sequence with one target array per bin.

        ``coef_`` and ``intercept_`` are replaced by per-bin lists only after
        every bin has been fitted, so a failure leaves the previous weights
        untouched.
        """
        coef_per_bin = []
        intercept_per_bin = []
        for i, p in enumerate(self.bins):
            print("[1/2] Getting predictions for each coordinate pair from FFT data...")
            preds = []
            for row in tqdm(inputs[i]):
                preds.append(self.predict((row[1], row[2], row[4], row[5]), p, mode="all"))
            print("[2/2] computing linear regression coefficients...")
            preds = np.array(preds)
            # Features: pairwise table values followed by the raw coordinates.
            inputs_featurized = np.concatenate((preds, np.delete(inputs[i], [0, 3], axis=1)), axis=1)
            reg = LinearRegression().fit(inputs_featurized, outputs[i])
            coef_per_bin.append(reg.coef_)
            intercept_per_bin.append(reg.intercept_)
        self.coef_ = coef_per_bin
        self.intercept_ = intercept_per_bin

    def initialize(self, files):
        """Placeholder for setting the model up from ``files``.

        TODO: not implemented yet. Only the signature existed; the intended
        behaviour still has to be defined.
        """
        raise NotImplementedError("fftpredictor.initialize is not implemented yet")

In [13]:
def get_training_data(files, cuts=None, z_range=(12280, 12300)):
    """Load inputs and targets from ROOT files and keep rows inside the cuts.

    For every file the tree ``"t;1"`` is read with ``RDataFrame(...).AsNumpy()``.
    The branches ``part_p, part_x, part_y, part_z, part_theta, part_phi`` become
    columns 0-5 of the input array; the branch ``cl_E_ecal`` is the target.
    A row is kept only if x, y, theta and phi lie strictly inside their
    ``cuts[...]["range"]`` window and z lies strictly inside ``z_range``.

    Args:
        files: iterable of ROOT file paths.
        cuts: mapping with ``"X"``, ``"Y"``, ``"theta"`` and ``"phi"`` entries,
            each holding a ``"range"`` pair. Defaults to the notebook-level
            ``params``.
        z_range: ``(low, high)`` window applied to ``part_z``.

    Returns:
        tuple: ``(X_set, obs_set)`` - two lists with one array per file, the
        selected input rows and the matching targets.
    """
    if cuts is None:
        # Fall back to the notebook-wide binning configuration.
        cuts = params
    branches = ("part_p", "part_x", "part_y", "part_z", "part_theta", "part_phi")
    # (column index, (low, high)) for every exclusive window that is applied.
    windows = (
        (1, cuts["X"]["range"]),
        (2, cuts["Y"]["range"]),
        (3, tuple(z_range)),
        (4, cuts["theta"]["range"]),
        (5, cuts["phi"]["range"]),
    )

    X_set = []
    obs_set = []
    for file in files:
        print(f"[1/2] Loading file: {file} (this may take a while) ...")
        data = RDataFrame("t;1", file).AsNumpy()
        X = np.array([data[name] for name in branches]).transpose()
        obs = data["cl_E_ecal"]
        print(f"[2/2] Subsetting data based on parameters: {cuts} ...")
        subset = np.ones(len(X), dtype=bool)
        for column, (low, high) in windows:
            values = X[:, column]
            subset &= (values > low) & (values < high)
        X_set.append(X[subset])
        obs_set.append(obs[subset])
    return X_set, obs_set
        
        

In [33]:
# Load the training set once and keep it bound to a name, so it can be
# inspected afterwards without reading the ROOT files a second time.
training_data = get_training_data(files)
model.fit_linear(*training_data)

[1/2] Loading file: FullSim_1GeV.root (this may take a while) ...
[2/3] Subsetting data based on parameters: {'X': {'range': (500, 3500), 'reso': 0.1}, 'Y': {'range': (500, 2800), 'reso': 0.1}, 'theta': {'range': (0, 0.6), 'reso': 100}, 'phi': {'range': (0, 6.283185307179586), 'reso': 10}} ...
[1/2] Loading file: FullSim_5GeV.root (this may take a while) ...
[2/3] Subsetting data based on parameters: {'X': {'range': (500, 3500), 'reso': 0.1}, 'Y': {'range': (500, 2800), 'reso': 0.1}, 'theta': {'range': (0, 0.6), 'reso': 100}, 'phi': {'range': (0, 6.283185307179586), 'reso': 10}} ...
[1/2] Loading file: FullSim_10GeV.root (this may take a while) ...
[2/3] Subsetting data based on parameters: {'X': {'range': (500, 3500), 'reso': 0.1}, 'Y': {'range': (500, 2800), 'reso': 0.1}, 'theta': {'range': (0, 0.6), 'reso': 100}, 'phi': {'range': (0, 6.283185307179586), 'reso': 10}} ...
[1/2] Getting predictions for each coordinate pair from FFT data...


100%|████████████████████████████| 60070237/60070237 [13:07<00:00, 76289.12it/s]


[2/2] computing linear regression coefficients...
[1/2] Getting predictions for each coordinate pair from FFT data...


100%|████████████████████████████| 60403703/60403703 [12:59<00:00, 77529.55it/s]


[2/2] computing linear regression coefficients...
[1/2] Getting predictions for each coordinate pair from FFT data...


100%|████████████████████████████| 25487788/25487788 [05:22<00:00, 79004.09it/s]


[2/2] computing linear regression coefficients...


In [37]:
# Keep a handle on the model's current linear coefficients and display them
# (the bare last expression is what renders as the cell output).
coef = model.coef_
coef

[array([2.06760854e-01, 2.24318204e-01, 7.27489415e-01, 5.77515270e-01,
        6.82154470e-01, 1.99721306e-01, 1.28499286e-05, 1.53433174e-05,
        4.11704027e-02, 2.51585091e-02]),
 array([-5.68759353e-02,  4.58101256e-01,  7.74284749e-01,  7.95634036e-01,
         8.37574140e-01, -2.51841410e-01,  4.06784754e-05,  1.40460751e-04,
         8.57536271e-02,  6.60613376e-02]),
 array([ 1.61857691e-01,  4.24262543e-01,  7.06796277e-01,  8.09580389e-01,
         8.57487536e-01, -2.26864829e-01,  1.35914080e-04,  4.50624295e-04,
         4.22329996e-01,  6.73391904e-02])]

In [40]:
# Displays `training_data`, which must already be bound by an earlier cell -
# presumably the (X_set, obs_set) tuple returned by get_training_data(files);
# TODO confirm. NOTE(review): this dumps the full arrays; showing shapes or a
# small slice would keep the output readable.
training_data

([array([[9.99999939e+02, 5.88440308e+02, 1.62765430e+03, 1.22857803e+04,
          1.39695823e-01, 4.72410631e+00],
         [1.00000000e+03, 1.64380469e+03, 1.66792468e+03, 1.22859258e+04,
          1.46638095e-01, 1.86546373e+00],
         [9.99999939e+02, 1.76707153e+03, 1.01445007e+03, 1.22835654e+04,
          1.66183084e-01, 5.41390240e-01],
         ...,
         [1.00000006e+03, 1.30967517e+03, 1.29966113e+03, 1.22845957e+04,
          1.64026365e-01, 3.87597466e+00],
         [1.00000000e+03, 3.32022876e+03, 1.46044116e+03, 1.22851768e+04,
          1.87364370e-01, 2.65906882e+00],
         [1.00000000e+03, 1.42315186e+03, 1.41152515e+03, 1.22850000e+04,
          5.94317496e-01, 8.48643303e-01]], dtype=float32),
  array([[5.0000000e+03, 2.6252981e+03, 2.5895183e+03, 1.2289256e+04,
          2.6834196e-01, 1.6430063e+00],
         [5.0000000e+03, 2.1472900e+03, 2.5148394e+03, 1.2288985e+04,
          5.8074760e-01, 5.8649855e+00],
         [5.0000000e+03, 3.0574438e+03, 8.942

In [None]:
# What to do next:
# - add a function that sets up the model and is callable from the model itself
# - make something so that predict doesn't need to be called in loops (compile a data-type function)
# - write docstrings


In [39]:
# Snapshot the fitted linear-model parameters before `model` is rebuilt below.
# `intercept` was previously read without ever being assigned in the notebook,
# which raises NameError on a fresh kernel.
coef, intercept = model.coef_, model.intercept_

# Binning configuration: per variable a (low, high) "range" and a "reso"
# factor converting an offset from range[0] to a grid index.
params = {
    "X": {"range": (500, 3500), "reso": 0.1},
    "Y": {"range": (500, 2800), "reso": 0.1},
    "theta": {"range": (0, 0.6), "reso": 100},
    "phi": {"range": (0, 2 * np.pi), "reso": 10},
}

# Rebuild the predictor and restore the fitted parameters on the new instance.
model = fftpredictor(load_multi_from_sparse([1000, 5000, 10000]), params=params)
model.coef_ = coef
model.intercept_ = intercept