In [6]:
# Data stuffs
import pickle 
import numpy as np
import pandas as pd
import pathlib 
import os, sys
from moxie.data.utils_ import load_data, standardize, de_standardize, normalize_profiles


# Make it look pretty
from tqdm.notebook import tqdm  

# ML Stuff
from scipy.stats import truncnorm


In [7]:
train_data, val_data, test_data = load_data(dataset_choice='SANDBOX_NO_VARIATIONS', file_loc='../../../moxie/data/processed/pedestal_profiles_ML_READY_ak_5052022_uncerts_mask.pickle')
(train_X, train_y, train_mask, train_radii, train_real_space_radii, train_ids, train_uncert), (val_X, val_y, val_mask, val_radii, val_real_space_radii, val_ids, val_uncert), (test_X, test_y, test_mask, test_radii, test_real_space_radii, test_ids, test_uncert) = train_data, val_data, test_data

machine_param_order = ['Q95', 'RGEO', 'CR0', 'VOLM', 'TRIU', 'TRIL', 'ELON', 'POHM', 'IPLA', 'BVAC', 'NBI', 'ICRH', 'ELER']

train_pulse_order = [int(x.split('/')[0]) for x in train_ids]
train_pulses = list(set(train_pulse_order))
train_pulse_idxs = [[index for index in range(len(train_pulse_order)) if train_pulse_order[index] == pulse] for pulse in train_pulses]

val_pulse_order = [int(x.split('/')[0]) for x in val_ids]
val_pulses = list(set(val_pulse_order))
val_pulse_idxs = [[index for index in range(len(val_pulse_order)) if val_pulse_order[index] == pulse] for pulse in val_pulses]

test_pulse_order = [int(x.split('/')[0]) for x in test_ids]
test_pulses = list(set(test_pulse_order))
test_pulse_idxs = [[index for index in range(len(test_pulse_order)) if test_pulse_order[index] == pulse] for pulse in test_pulses]



Collect neseps for each slice in train-val-test, and store and save as smaller subset. 

```
    dict_supervsied = {'train': {'mps': np.array((13, N)), 'neseps': np.array(N))}, 
                        'val': {'mps': np.array((13, N)), 'neseps': np.array(N))}, 
                        'test': {'mps': np.array((13, N)), 'neseps': np.array(N))}}
```

In [29]:
def calculate_neseps_in_set(set_ids, set_profiles, set_uncerts, set_masks, set_mps, set_shot_numbers_by_slice): 
    iterator = tqdm(range(len(set_ids)))
    
    nesep_means_by_pulse = []
    neseps_means = []
    mps_means = []
    for k in iterator: 
        slice_loc_in_set = set_ids[k]
        pulse_number = np.array(set_shot_numbers_by_slice)[set_ids[k]][0]
        
        sample_profiles, sample_uncerts, sample_masks, sample_mps = set_profiles[slice_loc_in_set], set_uncerts[slice_loc_in_set], set_masks[slice_loc_in_set], set_mps[slice_loc_in_set]
        pulse_neseps = calculate_nesep_for_pulse(sample_profiles, sample_uncerts, sample_masks, lbound_n = 0, ubound_n = 0.5e21, lbound_t=0, ubound_t=2000)
        nesep_means_by_pulse.extend(pulse_neseps)
        
        neseps_means.append(pulse_neseps.mean())
        mps_means.append(sample_mps.mean(0))
        iterator.set_description_str(str(pulse_number))        
    return np.array(nesep_means_by_pulse), np.array(neseps_means), np.array(mps_means)

def calculate_nesep_for_pulse(both_profiles, both_uncertanties, integer_masks, lbound_n, ubound_n, lbound_t, ubound_t, conditional_prediction=False):
    """
    This will return the neseps predicted for each time slice in the pulse. 
    """
    
    pulse_neseps = np.zeros(len(both_profiles))
    pulse_teseps = np.zeros(len(both_profiles))
    if conditional_prediction: 
        integer_masks = np.ones_like(integer_masks, dtype=bool)
        both_uncertanties = np.ones_like(both_uncertanties, dtype=bool)*200
    
    for n, (both_profiles_slice, both_uncertanties_slice, mask_int_slice) in enumerate(zip(both_profiles, both_uncertanties, integer_masks)):
        bool_mask = mask_int_slice > 0
        slice_ne, slice_te = both_profiles_slice[0, :][bool_mask], both_profiles_slice[1, :][bool_mask]
        slice_ne_uncert, slice_te_uncert = both_uncertanties_slice[0, :][bool_mask], both_uncertanties_slice[1, :][bool_mask]
        
        tes_gaussians = np.array([np.linspace(truncnorm.ppf(0.0001, (lbound_t - mu) / var, (ubound_t - mu) / var, mu, var), 
                                     truncnorm.ppf(0.9999, (lbound_t - mu) / var, (ubound_t - mu) / var, mu, var), 10000)
                        for mu, var in zip(slice_te, slice_te_uncert)])
        
        separatrix_loc = np.logical_and(tes_gaussians > 90, tes_gaussians < 110)
        tesep_in_separatrix = tes_gaussians[separatrix_loc]
        
        
        
        
        nes_gaussians = np.array([np.linspace(truncnorm.ppf(0.1, (lbound_n - mu) / var, (ubound_n - mu) / var, mu, var), 
                                     truncnorm.ppf(0.9, (lbound_n - mu) / var, (ubound_n - mu) / var, mu, var), 10000)
                        for mu, var in zip(slice_ne, slice_ne_uncert)])

        nes_in_separatrix = nes_gaussians[separatrix_loc]
        if len(nes_in_separatrix) == 0: 
            print(slice_ne, slice_te, slice_ne_uncert, slice_te_uncert)
        slice_nesep = np.mean(nes_in_separatrix)
        slice_tesep = np.mean(tesep_in_separatrix)
        pulse_neseps[n] = slice_nesep
        pulse_teseps[n] = slice_tesep
    
    return pulse_neseps
                    

In [30]:
train_neseps, train_neseps_mean, train_mps_mean = calculate_neseps_in_set(train_pulse_idxs, train_X, train_uncert, train_mask, train_y, train_pulse_order)
val_neseps, val_neseps_mean, val_mps_mean = calculate_neseps_in_set(val_pulse_idxs, val_X, val_uncert, val_mask, val_y, val_pulse_order)
test_neseps, test_neseps_mean, test_mps_mean = calculate_neseps_in_set(test_pulse_idxs, test_X, test_uncert, test_mask, test_y, test_pulse_order)

  0%|          | 0/533 [00:00<?, ?it/s]

  0%|          | 0/133 [00:00<?, ?it/s]

  0%|          | 0/77 [00:00<?, ?it/s]

In [35]:
print(len(train_neseps), len(train_y))
print(len(train_neseps_mean), train_mps_mean.shape)

22885 22885
533 (533, 13)


In [36]:
supervised_dict = {'train': {'mps': train_y, 'neseps': train_neseps, 'mps_pulse': train_mps_mean, 'neseps_pulse': train_neseps_mean}, 
                    'val': {'mps': val_y, 'neseps': val_neseps, 'mps_pulse': val_mps_mean, 'neseps_pulse': val_neseps_mean}, 
                    'test': {'mps': test_y, 'neseps': test_neseps, 'mps_pulse': test_mps_mean, 'neseps_pulse': test_neseps_mean}, 
                  'MP_ORDER': machine_param_order}

In [37]:
PERSONAL_DATA_DIR_PROC = '/home/kitadam/ENR_Sven/moxie/data/processed/'
with open(PERSONAL_DATA_DIR_PROC + 'supervised_set.pickle', 'wb') as file:
    pickle.dump(supervised_dict, file)
with open(PERSONAL_DATA_DIR_PROC + 'supervised_set.pickle', 'rb') as file:
    supervised_dict = pickle.load(file)

In [38]:
supervised_dict

{'train': {'mps': array([[3.2963583e+00, 2.8869312e+00, 9.3111372e-01, ..., 1.1229699e+07,
          0.0000000e+00, 3.3196598e+22],
         [3.3070524e+00, 2.8883126e+00, 9.3087339e-01, ..., 1.1240277e+07,
          0.0000000e+00, 3.3141165e+22],
         [3.3059418e+00, 2.8900638e+00, 9.3157303e-01, ..., 1.1241191e+07,
          0.0000000e+00, 3.3085839e+22],
         ...,
         [3.2609892e+00, 2.8754473e+00, 9.4554967e-01, ..., 2.0282006e+07,
          3.6276562e+06, 2.3830338e+22],
         [3.2822652e+00, 2.8756902e+00, 9.4688201e-01, ..., 1.8848416e+07,
          3.4138668e+06, 2.3187388e+22],
         [3.3058181e+00, 2.8758321e+00, 9.4882190e-01, ..., 1.6745158e+07,
          3.3308748e+06, 2.2554716e+22]], dtype=float32),
  'neseps': array([1.21896961e+19, 8.56472731e+18, 1.29973396e+19, ...,
         1.62349773e+19, 1.76692882e+19, 1.92025616e+19]),
  'mps_pulse': array([[4.3470349e+00, 2.8988640e+00, 9.2704594e-01, ..., 1.8879218e+07,
          0.0000000e+00, 5.6033939e+21